## Error: package 'xts' required by 'PerformanceAnalytics' could not be found

1 Where Do People Drink The Most Beer, Wine And Spirits?

Back in 2014, fivethirtyeight.com published an article on alcohol consumption in different countries. The `drinks` data is available as part of the fivethirtyeight package. Make sure you have installed the fivethirtyeight package before proceeding.

# Load the drinks data shipped with the fivethirtyeight package
library(fivethirtyeight)
data(drinks)


# or download the same data directly from the fivethirtyeight GitHub repository
# (read_csv comes from readr/tidyverse — assumed loaded earlier; TODO confirm)
alcohol_direct <- read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/alcohol-consumption/drinks.csv")

The skim function was useful for understanding the variable types and checking for missing values.

skim(alcohol_direct)  # skimr overview: variable types, completeness, summary stats
Data summary
Name alcohol_direct
Number of rows 193
Number of columns 5
_______________________
Column type frequency:
character 1
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 0 1 3 28 0 193 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
beer_servings 0 1 106.16 101.14 0 20.0 76.0 188.0 376.0 ▇▃▂▂▁
spirit_servings 0 1 80.99 88.28 0 4.0 56.0 128.0 438.0 ▇▃▂▁▁
wine_servings 0 1 49.45 79.70 0 1.0 8.0 59.0 370.0 ▇▁▁▁▁
total_litres_of_pure_alcohol 0 1 4.72 3.77 0 1.3 4.2 7.2 14.4 ▇▃▅▃▁
# There are two variable types, numeric and character. The country column is a character type and the other four columns (beer, wine, spirit, and total) are numeric. 
# No missing values to worry about. 

Plot that shows the top 25 beer consuming countries

# Top 25 beer-consuming countries.
# FIX: the original piped through glimpse(), a side-effect inspection helper
# that does not belong inside a data pipeline; start from the data directly.
top_25_beer <- alcohol_direct %>%
  arrange(desc(beer_servings)) %>%
  slice(1:25)

# Bar chart, countries ordered by servings.
# FIX: geom_smooth() was dropped — a trend line over an ordered categorical
# axis is meaningless and only produces warnings. The trailing `+ NULL`
# no-op was removed as well.
ggplot(top_25_beer, aes(x = fct_reorder(country, beer_servings), y = beer_servings)) +
  geom_col() +
  theme(plot.title = element_text(vjust = 1, size = 5),
        axis.text.x = element_text(angle = 90)) +
  labs(title = "Top 25 Beer Consuming Countries",
       x = "Countries",
       y = "Beer Servings")

Plot that shows the top 25 wine consuming countries

# Top 25 wine-consuming countries.
# FIX: removed the glimpse() side-effect call from the pipeline.
top_25_wine <- alcohol_direct %>%
  arrange(desc(wine_servings)) %>%
  slice(1:25)

# Bar chart, countries ordered by servings.
# FIX: geom_smooth() dropped (meaningless on a discrete axis); `+ NULL` removed.
ggplot(top_25_wine, aes(x = fct_reorder(country, wine_servings), y = wine_servings)) +
  geom_col() +
  theme(plot.title = element_text(vjust = 1, size = 5),
        axis.text.x = element_text(angle = 90)) +
  labs(title = "Top 25 Wine Consuming Countries",
       x = "Countries",
       y = "Wine Servings")

Plot that shows the top 25 spirit consuming countries

# Top 25 spirit-consuming countries.
# FIX: removed the glimpse() side-effect call from the pipeline.
top_25_spirit <- alcohol_direct %>%
  arrange(desc(spirit_servings)) %>%
  slice(1:25)

# Bar chart, countries ordered by servings.
# FIX: geom_smooth() dropped (meaningless on a discrete axis); `+ NULL` removed.
ggplot(top_25_spirit, aes(x = fct_reorder(country, spirit_servings), y = spirit_servings)) +
  geom_col() +
  theme(plot.title = element_text(vjust = 1, size = 5),
        axis.text.x = element_text(angle = 90)) +
  labs(title = "Top 25 Spirit Consuming Countries",
       x = "Countries",
       y = "Spirit Servings")

The graphs depict that, on average, Namibians consume the most beer, the French consume the most wine, and Grenadians consume the most spirits. These results may be a consequence of availability of particular ingredients (sugarcane, hops, barley, grapes), implications of specific climates, or cultural influences. Other countries have increasingly stringent or relaxed regulations on the different types of alcohol which alter the consumption habits of consumers.

2 Analysis of movies- IMDB dataset

We will look at a subset sample of movies, taken from the Kaggle IMDB 5000 movie dataset

This loads the dataset movies.

# Read the IMDB movies sample (movies.csv must be in the working directory)
movies <- read_csv("movies.csv")
glimpse(movies)  # inspect column names and types

Skim was used to check for missing values; there were none. However, there are duplicate values in the three character-type columns.

# Summary statistics and missing-value check for the movies data
skim(movies)
## Error in skim(movies): object 'movies' not found

This is used for displaying a series of tables concerning movies.

# Table with the count of movies by genre, ranked in descending order.
# FIX: count(genre, sort = TRUE) is the idiomatic one-step equivalent of
# group_by(genre) %>% count(sort = TRUE) and, unlike the original, does not
# leave a lingering grouping on the result.
movies %>%
  count(genre, sort = TRUE)

# Table with the average gross earnings and budget (`gross` and `budget`) by
# genre, plus return on budget, ordered from the highest-returning genre down.
# (summarize() may reference columns created earlier in the same call.)
movies %>%
  group_by(genre) %>%
  summarize(mean_budget = mean(budget),
            mean_gross_earning = mean(gross),
            return_on_budget = mean_gross_earning / mean_budget) %>%
  arrange(desc(return_on_budget))
# Table that shows the top 15 directors who have created the highest gross revenue in the box office
# Per-director mean, sd, median and total gross, sorted by total gross and
# truncated to the 15 highest-grossing directors.

top_15_directors <- movies %>%
  group_by(director) %>% 
  summarize(mean_director = mean(gross), standard_dev = sd(gross), median = median(gross), total = sum(gross)) %>% 
  arrange(desc(total)) %>% 
  slice(1:15)
## Error in eval(lhs, parent, parent): object 'movies' not found
top_15_directors
## Error in eval(expr, envir, enclos): object 'top_15_directors' not found
# Table that describes how ratings are distributed by genre:
# min, max, median and standard deviation of `rating` per genre.
rating <- movies %>%
  group_by(genre) %>%
  summarize(min = min(rating), max = max(rating), median = median(rating), standard_dev = sd(rating))

# Overall density of ratings.
# FIX: the original passed `colour = rating` to labs(), but no colour
# aesthetic is mapped in this plot, so that label referenced an undefined
# object; it has been removed.
ggplot(movies, aes(x = rating)) +
  geom_density() +
  labs(title = "Ratings Density")

We have placed “gross” on the y-axis and “cast_facebook_likes” on the x-axis. The amount of Facebook likes the cast has received is likely not the strongest indicator of high gross revenues.

# Gross revenue vs cast Facebook likes; both axes on log10 scales to spread
# the heavy right tails of the two distributions.
ggplot(movies, aes(x = cast_facebook_likes, y= gross))+
  geom_point()+
  scale_x_log10()+
  scale_y_log10()
## Error in ggplot(movies, aes(x = cast_facebook_likes, y = gross)): object 'movies' not found
# If you have higher earnings... more likes

The correlation between gross revenues and budget seems to be stronger than the previous correlation with Facebook likes.

# Gross revenue vs budget on log10 scales; visually a tighter relationship
# than the Facebook-likes plot above.
ggplot(movies, aes(x = gross, y= budget))+
  geom_point()+
  scale_x_log10()+
  scale_y_log10()
## Error in ggplot(movies, aes(x = gross, y = budget)): object 'movies' not found
# Higher budget ... more earnings

There seems to be a small relationship between gross revenue and rating when faceted by genre. Depending on the genre the correlation slightly changes, or there isn’t enough data in particular genres to make a determination.

# Gross revenue vs rating, one panel per genre; log10 scales on both axes.
ggplot(movies, aes(x = gross, y= rating))+
  geom_point()+
  facet_wrap(vars(genre))+
  scale_x_log10()+
  scale_y_log10()
## Error in ggplot(movies, aes(x = gross, y = rating)): object 'movies' not found
# Higher rating = more money

Returns of financial stocks

# NYSE listings (nyse.csv expected in the working directory)
nyse <- read_csv("nyse.csv")

Table and a bar plot that shows the number of companies per sector, in descending order

# Number of companies listed in each sector, largest sector first.
# count() is the one-step equivalent of group_by + summarise(n()) + arrange.
chart_sector <- nyse %>%
  count(sector, name = "number_of_companies", sort = TRUE)

# Bar chart of the sector counts, bars ordered from most to fewest companies.
ggplot(chart_sector,
       aes(x = fct_reorder(sector, -number_of_companies),
           y = number_of_companies)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Companies Per Sector", x = "Sector", y = "Number Per Sector")

Gathers the tickers from Wikipedia

# Wikipedia page listing the Dow Jones Industrial Average constituents
djia_url <- "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"

# get all <table> nodes that exist on the URL (rvest)
tables <- djia_url %>% 
  read_html() %>% 
  html_nodes(css="table")


# parse each HTML table into a data frame, collected in the list `djia`.
# purrr::map() applies the parse+clean pipeline to every table on the page;
# clean_names() (janitor) normalises the column names.
djia <- map(tables, . %>% 
               html_table(fill=TRUE)%>% 
               clean_names())


# constituents
table1 <- djia[[2]] %>% # the second table on the page contains the ticker symbols
  mutate(date_added = ymd(date_added),

         # Yahoo Finance wants the bare ticker, but NYSE-listed stocks appear
         # in the table as e.g. "NYSE: MMM".
         # FIX: the original pattern "NYSE*" is a regex meaning "NYS" followed
         # by zero-or-more "E" anywhere in the string (* is a quantifier, not
         # a glob wildcard). Anchor the literal prefix instead, then drop the
         # leading "NYSE: " (first 6 characters).
         ticker = ifelse(str_detect(symbol, "^NYSE:"),
                         str_sub(symbol, 7, 11),
                         symbol))

# we need a vector of strings with just the 30 tickers + SPY
tickers <- table1 %>% 
  select(ticker) %>% 
  pull() %>% # pull() extracts the column as a plain character vector
  c("SPY") # and lets us append SPY, the S&P 500 ETF used as a benchmark

Price data on stocks

# Notice the cache=TRUE argument in the chunk options. Because getting data is time consuming, # cache=TRUE means that once it downloads data, the chunk will not run again next time you knit your Rmd

# Download daily price history for every ticker from Yahoo Finance via
# tidyquant; the fixed date window keeps the results reproducible.
myStocks <- tickers %>% 
  tq_get(get  = "stock.prices",
         from = "2000-01-01",
         to   = "2020-08-31") %>%
  group_by(symbol) 
## Error in tq_get(., get = "stock.prices", from = "2000-01-01", to = "2020-08-31"): could not find function "tq_get"
glimpse(myStocks) # examine the structure of the resulting data frame

Calculation of different return periods

# calculate daily returns (log returns from the adjusted close).
# FIX: the original passed `cols = c(nested.col)` — `nested.col` is never
# defined anywhere and `cols` is not a tq_transmute() argument, so the call
# would error; the stray argument has been removed.
myStocks_returns_daily <- myStocks %>%
  tq_transmute(select     = adjusted,
               mutate_fun = periodReturn,
               period     = "daily",
               type       = "log",
               col_rename = "daily_returns")
# calculate monthly returns (arithmetic returns from the adjusted close).
# FIX: removed the invalid `cols = c(nested.col)` argument (`nested.col` is
# undefined and `cols` is not a tq_transmute() parameter).
myStocks_returns_monthly <- myStocks %>%
  tq_transmute(select     = adjusted,
               mutate_fun = periodReturn,
               period     = "monthly",
               type       = "arithmetic",
               col_rename = "monthly_returns")
# calculate yearly returns (arithmetic returns from the adjusted close).
# FIX: removed the invalid `cols = c(nested.col)` argument (`nested.col` is
# undefined and `cols` is not a tq_transmute() parameter). The group_by() is
# redundant (myStocks is already grouped by symbol) but harmless, so kept.
myStocks_returns_annual <- myStocks %>%
  group_by(symbol) %>%
  tq_transmute(select     = adjusted,
               mutate_fun = periodReturn,
               period     = "yearly",
               type       = "arithmetic",
               col_rename = "yearly_returns")

Summarized monthly returns since 2017-01-01 for each of the stocks and SPY; min, max, median, mean, SD.

# Per-symbol summary of monthly returns since 2017-01-01:
# min, max, median, mean and standard deviation.
summarise_monthly_returns <- myStocks_returns_monthly %>%
  dplyr::group_by(symbol) %>%
  dplyr::filter(date >= as.Date("2017-01-01")) %>%
  summarise(min_return = min(monthly_returns), max_return = max(monthly_returns), median_return = median(monthly_returns), mean_return = mean(monthly_returns), std_return = sd(monthly_returns))
## Error in eval(lhs, parent, parent): object 'myStocks_returns_monthly' not found
summarise_monthly_returns
## Error in eval(expr, envir, enclos): object 'summarise_monthly_returns' not found

Density plot, using geom_density(), for each of the stocks

# Density of monthly returns, one panel per ticker — a narrow density
# indicates lower month-to-month volatility.
ggplot(myStocks_returns_monthly, aes( x = monthly_returns)) +
  geom_density() +
  facet_wrap(~symbol) +
  labs(title = "Density plot of the distribution of monthly returns",
    y = "Density", x = "Monthly Returns") +
    theme(plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
## Error in ggplot(myStocks_returns_monthly, aes(x = monthly_returns)): object 'myStocks_returns_monthly' not found

We can infer from the plot that the “SPY” is the least risky since the monthly return distribution is the most narrow, whereas the “DOW” has the widest distribution making it the most risky.

Plot that shows the expected monthly return (mean) of a stock on the Y axis and the risk (standard deviation) in the X-axis.

# Risk/return scatter: mean monthly return (y) against standard deviation (x)
# per ticker, with non-overlapping labels from ggrepel.
myStocks_returns_monthly %>% 
  group_by(symbol) %>% 
  summarise(mean=mean(monthly_returns),sd=sd(monthly_returns)) %>%
  ggplot(aes(x=sd,y=mean,label=symbol)) +
  geom_point() +
  geom_smooth() +
  ggrepel::geom_text_repel(show.legend = FALSE,size = 5) +
  labs(title = "Expected Monthly Returns vs Risk ",
    y = "Expected monthly returns",
    x = "Risk") +
    theme(plot.title = element_text(hjust = 0.5, size = 10, face = "bold"))
## Error in eval(lhs, parent, parent): object 'myStocks_returns_monthly' not found

From this plot we can say that usually when the risk is low the expected monthly return is also low (e.g. SPY, JNJ) and vice versa (e.g. AAPL, CRM). However, DOW is risky but the expected monthly return is not very high. On the other hand, V, UNH and NKE have high expected return compared to their risk profile.

3 On your own: IBM HR Analytics

For this task, you will analyse a data set on Human Resource Analytics. The IBM HR Analytics Employee Attrition & Performance data set is a fictional data set created by IBM data scientists. Among other things, the data set includes employees’ income, their distance from work, their position in the company, their level of education, etc. A full description can be found on the website.

# Read the IBM HR Attrition data set (1,470 employees x 35 columns)
hr_dataset <- read_csv("datasets_1067_1925_WA_Fn-UseC_-HR-Employee-Attrition.csv")
glimpse(hr_dataset)  # inspect column names and types
## Rows: 1,470
## Columns: 35
## $ Age                      <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, …
## $ Attrition                <chr> "Yes", "No", "Yes", "No", "No", "No", "No", …
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Trave…
## $ DailyRate                <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358…
## $ Department               <chr> "Sales", "Research & Development", "Research…
## $ DistanceFromHome         <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26,…
## $ Education                <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3,…
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "…
## $ EmployeeCount            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ EmployeeNumber           <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16…
## $ EnvironmentSatisfaction  <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3,…
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", …
## $ HourlyRate               <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, …
## $ JobInvolvement           <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2,…
## $ JobLevel                 <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1,…
## $ JobRole                  <chr> "Sales Executive", "Research Scientist", "La…
## $ JobSatisfaction          <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3,…
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "M…
## $ MonthlyIncome            <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 26…
## $ MonthlyRate              <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 996…
## $ NumCompaniesWorked       <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5,…
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes"…
## $ PercentSalaryHike        <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, …
## $ PerformanceRating        <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3,…
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2,…
## $ StandardHours            <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, …
## $ StockOptionLevel         <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0,…
## $ TotalWorkingYears        <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, …
## $ TrainingTimesLastYear    <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4,…
## $ WorkLifeBalance          <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3,…
## $ YearsAtCompany           <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4…
## $ YearsInCurrentRole       <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2,…
## $ YearsSinceLastPromotion  <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0,…
## $ YearsWithCurrManager     <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3,…

Clean the data set

# Recode the integer survey scales to their labelled categories (per the IBM
# HR data dictionary) and keep only the columns used in the analysis below.
hr_cleaned <- hr_dataset %>% 
  clean_names() %>%   # janitor: CamelCase column names -> snake_case
  mutate(
    # 1-5 education codes -> degree labels
    education = case_when(
      education == 1 ~ "Below College",
      education == 2 ~ "College",
      education == 3 ~ "Bachelor",
      education == 4 ~ "Master",
      education == 5 ~ "Doctor"
    ),
    # 1-4 satisfaction codes -> Low / Medium / High / Very High
    environment_satisfaction = case_when(
      environment_satisfaction == 1 ~ "Low",
      environment_satisfaction == 2 ~ "Medium",
      environment_satisfaction == 3 ~ "High",
      environment_satisfaction == 4 ~ "Very High"
    ),
    job_satisfaction = case_when(
      job_satisfaction == 1 ~ "Low",
      job_satisfaction == 2 ~ "Medium",
      job_satisfaction == 3 ~ "High",
      job_satisfaction == 4 ~ "Very High"
    ),
    # 1-4 performance codes -> descriptive ratings
    performance_rating = case_when(
      performance_rating == 1 ~ "Low",
      performance_rating == 2 ~ "Good",
      performance_rating == 3 ~ "Excellent",
      performance_rating == 4 ~ "Outstanding"
    ),
    # 1-4 work-life-balance codes -> descriptive ratings
    work_life_balance = case_when(
      work_life_balance == 1 ~ "Bad",
      work_life_balance == 2 ~ "Good",
      work_life_balance == 3 ~ "Better",
      work_life_balance == 4 ~ "Best"
    )
  ) %>% 
  # keep only the analysis columns
  select(age, attrition, daily_rate, department,
         distance_from_home, education,
         gender, job_role,environment_satisfaction,
         job_satisfaction, marital_status,
         monthly_income, num_companies_worked, percent_salary_hike,
         performance_rating, total_working_years,
         work_life_balance, years_at_company,
         years_since_last_promotion)

Summary describing this dataset.

#1. How often people leave the company (`attrition`).
# FIX: the original computed sum(people_leave)/count(hr_cleaned) — count()
# returns a one-cell data frame, not a scalar, so the division is malformed.
# The attrition rate is simply the mean of the "Yes" indicator.
attrition_rate <- hr_cleaned %>%
  summarize(attrition_rate = mean(attrition == "Yes"))

#2. Shows how `age`, `years_at_company`, `monthly_income` and `years_since_last_promotion` distributed. Looking at the summary statistics, we can see "Age" is the variable closer to normal distribution since its mean and median are closer together.

# Quartile summary of the four numeric columns of interest; comparing mean
# vs median indicates skew (age is the closest to symmetric).
summary(hr_cleaned %>% 
  select(age,years_at_company,monthly_income,years_since_last_promotion))
##       age       years_at_company monthly_income  years_since_last_promotion
##  Min.   :18.0   Min.   : 0       Min.   : 1009   Min.   : 0.00             
##  1st Qu.:30.0   1st Qu.: 3       1st Qu.: 2911   1st Qu.: 0.00             
##  Median :36.0   Median : 5       Median : 4919   Median : 1.00             
##  Mean   :36.9   Mean   : 7       Mean   : 6503   Mean   : 2.19             
##  3rd Qu.:43.0   3rd Qu.: 9       3rd Qu.: 8379   3rd Qu.: 3.00             
##  Max.   :60.0   Max.   :40       Max.   :19999   Max.   :15.00
#3. Shows how `job_satisfaction` and `work_life_balance` distributed. 

# Count and share of employees at each job-satisfaction level
hr_cleaned %>% 
  group_by(job_satisfaction) %>% 
  summarise(total= n()) %>% 
  mutate(percentage_of_total = total / sum(total))
## # A tibble: 4 x 3
##   job_satisfaction total percentage_of_total
##   <chr>            <int>               <dbl>
## 1 High               442               0.301
## 2 Low                289               0.197
## 3 Medium             280               0.190
## 4 Very High          459               0.312
# Count and share of employees at each work-life-balance level
hr_cleaned %>% 
  group_by(work_life_balance) %>% 
  summarise(total= n()) %>% 
  mutate(percentage_of_total = total / sum(total))
## # A tibble: 4 x 3
##   work_life_balance total percentage_of_total
##   <chr>             <int>               <dbl>
## 1 Bad                  80              0.0544
## 2 Best                153              0.104 
## 3 Better              893              0.607 
## 4 Good                344              0.234
#4. Relationship between monthly income and education, as well as monthly income and gender. Individuals with a masters or doctorate have higher incomes than those with other degrees. Females tend to receive slightly higher monthly income.

# Boxplot of monthly income across education levels
hr_cleaned %>% 
  ggplot(aes(x= education , y= monthly_income)) +
   geom_boxplot()+
   labs(title = "Relationship between monthly income and education",subtitle ="People with higher education tend to receive higher monthly income",x = "Education",y = "Monthly Income")

# Boxplot of monthly income by gender.
# FIX: the x-axis label said "Education" (copy-paste slip from the previous
# plot) — corrected to "Gender"; also fixed the subtitle typo "recive".
hr_cleaned %>%
  ggplot(aes(x = gender, y = monthly_income)) +
  geom_boxplot() +
  labs(title = "Relationship between monthly income and gender",
       subtitle = "Females tend to receive slightly higher monthly income",
       x = "Gender",
       y = "Monthly Income")

#5. Boxplot of income vs job role, in descending order. 

# Boxplot of monthly income per job role, roles ordered by descending income
hr_cleaned %>% 
  ggplot(aes(x= reorder(job_role,-monthly_income) , y=monthly_income )) +
   geom_boxplot()+
  theme(plot.title = element_text(vjust = 1, size = 5),axis.text.x = element_text(angle = 90))+
   labs(title = "Relationship between monthly income and job role",
    y = "Monthly Income",
    x = "Job role")

#6. Bar chart of the mean (or median?) income by education level.

# Mean monthly income per education level
mean_income <- hr_cleaned %>% 
  group_by(education) %>% 
  summarise(mean_income=mean(monthly_income))

# Bar chart of the means, highest education level first
ggplot(mean_income, aes(x=reorder(education,-mean_income),y=mean_income))+
  geom_col()+
   labs(title = "Relationship between mean monthly income and education level",
    y = "Mean Monthly Income",
    x = "Education")

# Median monthly income per education level (robust to income outliers)
median_income <- hr_cleaned %>% 
  group_by(education) %>% 
  summarise(median_income=median(monthly_income))

# Bar chart of the medians, same ordering scheme as above
ggplot(median_income, aes(x=reorder(education,-median_income),y=median_income))+
  geom_col()+
   labs(title = "Relationship between median monthly income and education level",
    y = "Median Monthly Income",
    x = "Education")

#7. The distribution of income by education level.

# Income density, one panel per education level
ggplot(hr_cleaned, aes(x=monthly_income))+
  geom_density()+
  facet_wrap(vars(education))+
  labs(title = "Distribution of monthly income by education level ",
    x= "Monthly Income",
    y="Density")

#8. Plot of income vs age, faceted by `job_role`

# Income vs age per job role; both axes log10-scaled to spread the points
ggplot(hr_cleaned, aes(x=age,y=monthly_income))+
  geom_point()+
  facet_wrap(vars(job_role))+
  labs(title = "Relationship between monthly income and age by job role ",
    x= "Age",
    y="Monthly Income")+scale_x_log10()+
  scale_y_log10()

4 Challenge 1: Replicating a chart

Figure 3 shows the homicide and suicide rate among white men.

Creating a replication of figure 3 (homicide and suicide rate among white men).

##   ST    State Population.Black Population.White Deaths.homicide.Black
## 1 AK   Alaska           153664          2274605                    23
## 2 AK   Alaska           153664          2274605                    NA
## 3 AL  Alabama          5376635         14267619                   183
## 4 AL  Alabama          5376635         14267619                  1812
## 5 AR Arkansas          1991165          9747025                   667
## 6 AR Arkansas          1991165          9747025                    90
##   Deaths.homicide.White crude.homicide.Black crude.homicide.White
## 1                    78                14.97                 3.43
## 2                    10                   NA                   NA
## 3                   124                 3.40                 0.87
## 4                   620                33.70                 4.35
## 5                   416                33.50                 4.27
## 6                   110                 4.52                 1.13
##   adjusted.homicide.Black adjusted.homicide.White Deaths.suicide.Black
## 1                   12.30                    3.24                   23
## 2                      NA                      NA                   12
## 3                    3.51                    0.85                  148
## 4                   33.00                    4.47                  370
## 5                   33.39                    4.39                  118
## 6                    4.81                    1.13                   75
##   Deaths.suicide.White crude.suicide.Black crude.suicide.White
## 1                  535               14.97               23.52
## 2                  196                  NA                8.62
## 3                 1222                2.75                8.56
## 4                 3195                6.88               22.39
## 5                 2154                5.93               22.10
## 6                 1027                3.77               10.54
##   adjusted.suicide.Black adjusted.suicide.White        type crude.RD.suicide
## 1                  12.77                  23.47     Firearm            -8.55
## 2                     NA                   8.21 Non-Firearm               NA
## 3                   2.84                   8.64 Non-Firearm            -5.81
## 4                   7.20                  20.97     Firearm           -15.51
## 5                   6.25                  20.98     Firearm           -16.17
## 6                   3.93                  10.89 Non-Firearm            -6.77
##   adj.RD.suicide crude.RD.homicide adj.RD.homicide ST.order.RD.homicide
## 1         -10.70             11.54            9.06                   AK
## 2             NA                NA              NA                   AK
## 3          -5.80              2.53            2.66                   AL
## 4         -13.77             29.35           28.53                   AL
## 5         -14.73             29.23           29.00                   AR
## 6          -6.96              3.39            3.68                   AR
##   ST.order.RD.suicide gun.house.prev gun.house.prev.category average.pop.white
## 1                  AK           59.8             45.0%-65.5%            252734
## 2                  AK           59.8             45.0%-65.5%            252734
## 3                  AL           52.2             45.0%-65.5%           1585291
## 4                  AL           52.2             45.0%-65.5%           1585291
## 5                  AR           58.8             45.0%-65.5%           1083003
## 6                  AR           58.8             45.0%-65.5%           1083003
##   average.pop.black          type.fac
## 1             17074   Firearm-related
## 2             17074 Firearm-unrelated
## 3            597404 Firearm-unrelated
## 4            597404   Firearm-related
## 5            221241   Firearm-related
## 6            221241 Firearm-unrelated
## Rows: 104
## Columns: 28
## $ ST                      <chr> "AK", "AK", "AL", "AL", "AR", "AR", "AZ", "AZ…
## $ State                   <chr> "Alaska", "Alaska", "Alabama", "Alabama", "Ar…
## $ Population.Black        <int> 153664, 153664, 5376635, 5376635, 1991165, 19…
## $ Population.White        <int> 2274605, 2274605, 14267619, 14267619, 9747025…
## $ Deaths.homicide.Black   <int> 23, NA, 183, 1812, 667, 90, 300, 40, 3712, 43…
## $ Deaths.homicide.White   <int> 78, 10, 124, 620, 416, 110, 586, 145, 1399, 6…
## $ crude.homicide.Black    <dbl> 14.97, NA, 3.40, 33.70, 33.50, 4.52, 22.06, 2…
## $ crude.homicide.White    <dbl> 3.43, NA, 0.87, 4.35, 4.27, 1.13, 3.46, 0.86,…
## $ adjusted.homicide.Black <dbl> 12.30, NA, 3.51, 33.00, 33.39, 4.81, 20.40, 2…
## $ adjusted.homicide.White <dbl> 3.24, NA, 0.85, 4.47, 4.39, 1.13, 3.63, 0.91,…
## $ Deaths.suicide.Black    <int> 23, 12, 148, 370, 118, 75, 101, 80, 492, 639,…
## $ Deaths.suicide.White    <int> 535, 196, 1222, 3195, 2154, 1027, 4146, 1972,…
## $ crude.suicide.Black     <dbl> 14.97, NA, 2.75, 6.88, 5.93, 3.77, 7.43, 5.88…
## $ crude.suicide.White     <dbl> 23.52, 8.62, 8.56, 22.39, 22.10, 10.54, 24.47…
## $ adjusted.suicide.Black  <dbl> 12.77, NA, 2.84, 7.20, 6.25, 3.93, 7.51, 5.58…
## $ adjusted.suicide.White  <dbl> 23.47, 8.21, 8.64, 20.97, 20.98, 10.89, 21.42…
## $ type                    <chr> "Firearm", "Non-Firearm", "Non-Firearm", "Fir…
## $ crude.RD.suicide        <dbl> -8.55, NA, -5.81, -15.51, -16.17, -6.77, -17.…
## $ adj.RD.suicide          <dbl> -10.70, NA, -5.80, -13.77, -14.73, -6.96, -13…
## $ crude.RD.homicide       <dbl> 11.54, NA, 2.53, 29.35, 29.23, 3.39, 18.60, 2…
## $ adj.RD.homicide         <dbl> 9.06, NA, 2.66, 28.53, 29.00, 3.68, 16.77, 1.…
## $ ST.order.RD.homicide    <chr> "AK", "AK", "AL", "AL", "AR", "AR", "AZ", "AZ…
## $ ST.order.RD.suicide     <chr> "AK", "AK", "AL", "AL", "AR", "AR", "AZ", "AZ…
## $ gun.house.prev          <dbl> 59.8, 59.8, 52.2, 52.2, 58.8, 58.8, 32.3, 32.…
## $ gun.house.prev.category <chr> "45.0%-65.5%", "45.0%-65.5%", "45.0%-65.5%", …
## $ average.pop.white       <dbl> 252734, 252734, 1585291, 1585291, 1083003, 10…
## $ average.pop.black       <dbl> 17074, 17074, 597404, 597404, 221241, 221241,…
## $ type.fac                <chr> "Firearm-related", "Firearm-unrelated", "Fire…
## Rows: 49
## Columns: 5
## $ ST                      <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DE…
## $ adjusted.suicide.White  <dbl> 23.47, 20.97, 20.98, 21.42, 11.63, 19.48, 6.4…
## $ adjusted.homicide.White <dbl> 3.24, 4.47, 4.39, 3.63, 2.05, 1.75, 0.86, 1.7…
## $ gun.house.prev.category <chr> "45.0%-65.5%", "45.0%-65.5%", "45.0%-65.5%", …
## $ average.pop.white       <dbl> 252734, 1585291, 1083003, 1882345, 7703022, 1…

5 Challenge 2: 2016 California Contributors plots

Displays a figure we need to reproduce that shows the top ten cities in highest amounts raised in political contributions in California during the 2016 US Presidential election.

To get this plot, you must join two dataframes; the one you have with all contributions, and data that can translate zipcodes to cities. You can find a file with all US zipcodes, e.g., here http://www.uszipcodelist.com/download.html.

The easiest way would be to create two plots and then place one next to each other. For this, you will need the patchwork package. https://cran.r-project.org/web/packages/patchwork/index.html

While this is ok, what if one asked you to create the same plot for the top 10 candidates and not just the top two? The most challenging part is how to reorder within categories, and for this you will find Julia Silge’s post on REORDERING AND FACETTING FOR GGPLOT2 useful.

# Make sure you use vroom() as it is significantly faster than read.csv()

# Read the contributions and the zip-code lookup (both under data/)
CA_contributors_2016 <- vroom::vroom(here::here("data","CA_contributors_2016.csv"))
zipcodes_2016 <- vroom::vroom(here::here("data","zip_code_database.csv"))


# Coerce zip to double so the join key types match, then attach city names
# to every contribution via a left join on zip.
zipcodes_2016 <- zipcodes_2016 %>%
    mutate(zip=as.double(zip))
merged_dataframe <- left_join( CA_contributors_2016, zipcodes_2016, by="zip")

# Table showing the top 10 cities by total contribution amount, descending
merged_dataframe %>%
  group_by(primary_city) %>%
  summarise(total_amount=sum(contb_receipt_amt)) %>% arrange(desc(total_amount)) %>% slice(1:10)
## # A tibble: 10 x 2
##    primary_city  total_amount
##    <chr>                <dbl>
##  1 Los Angeles      16608405.
##  2 San Francisco    15716677.
##  3 Oakland           3940076.
##  4 San Diego         3932830.
##  5 Palo Alto         3375370.
##  6 Beverly Hills     3267148.
##  7 Berkeley          3104403.
##  8 Santa Monica      2926874.
##  9 San Jose          2445544.
## 10 Sacramento        2384442.
library(patchwork)
library(dplyr)

# Total contributions received per candidate, top ten in descending order.
merged_dataframe %>%
  group_by(cand_nm) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  arrange(desc(total_amount)) %>%
  slice_head(n = 10)
## # A tibble: 10 x 2
##    cand_nm                   total_amount
##    <chr>                            <dbl>
##  1 Clinton, Hillary Rodham      94784569.
##  2 Sanders, Bernard             20649084.
##  3 Trump, Donald J.             14064776.
##  4 Cruz, Rafael Edward 'Ted'     6075175.
##  5 Rubio, Marco                  5595281.
##  6 Bush, Jeb                     3357550.
##  7 Carson, Benjamin S.           3145278.
##  8 Kasich, John R.               1571849.
##  9 Fiorina, Carly                1501113.
## 10 Paul, Rand                     823542.
# Top ten cities where Hillary Clinton received the most contributions.
# primary_city is turned into a factor ordered by amount (while all cities
# are still present) so the subsequent ggplot bars come out sorted.
CA_2016_Clinton <- merged_dataframe %>%
  filter(cand_nm == "Clinton, Hillary Rodham") %>%
  group_by(primary_city) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  arrange(desc(total_amount)) %>%
  mutate(primary_city = fct_reorder(primary_city, total_amount)) %>%
  slice(1:10)

CA_2016_Clinton
## # A tibble: 10 x 2
##    primary_city  total_amount
##    <fct>                <dbl>
##  1 San Francisco    12345963.
##  2 Los Angeles      12022399.
##  3 Oakland           2955387.
##  4 Palo Alto         2664346.
##  5 Berkeley          2300980.
##  6 Beverly Hills     2226442.
##  7 Santa Monica      2134503.
##  8 San Diego         2094960.
##  9 Sacramento        1623772.
## 10 Los Altos         1500963.
# Top ten cities where Donald Trump received the most contributions.
# Same recipe as for Clinton: aggregate per city, order the factor by
# amount, then keep the ten largest.
CA_2016_trump <- merged_dataframe %>%
  filter(cand_nm == "Trump, Donald J.") %>%
  group_by(primary_city) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  arrange(desc(total_amount)) %>%
  mutate(primary_city = fct_reorder(primary_city, total_amount)) %>%
  slice(1:10)

CA_2016_trump
## # A tibble: 10 x 2
##    primary_city    total_amount
##    <fct>                  <dbl>
##  1 Los Angeles          544464.
##  2 San Diego            482885.
##  3 Newport Beach        347206.
##  4 Bakersfield          244419.
##  5 San Francisco        230712.
##  6 San Jose             190652.
##  7 Fresno               182674.
##  8 Rancho Santa Fe      178233.
##  9 Sacramento           171656.
## 10 Irvine               157518.
# Bar chart: top ten cities contributing to Clinton, in descending order.
Plot_clinton <- CA_2016_Clinton %>%
  ggplot(aes(x = total_amount, y = primary_city)) +
  geom_col() +
  theme_minimal() +
  labs(x = "", y = "", title = "Clinton, Hillary Rodham")

Plot_clinton

# Bar chart: top ten cities contributing to Trump, in descending order.
Plot_trump <- CA_2016_trump %>%
  ggplot(aes(x = total_amount, y = primary_city)) +
  geom_col() +
  theme_minimal() +
  labs(x = "", y = "", title = "Trump, Donald J.")

Plot_trump

library(patchwork)

# Compare the top ten contributing cities for Clinton and Trump by
# displaying the two plots side by side (patchwork's `+` composes two
# plots into a single 1x2 layout, equivalent to wrap_plots() here).
Plot_clinton + Plot_trump

library(tidytext)

# Top ten candidates ranked by total contributions to their campaigns.
top_cand <- merged_dataframe %>%
  group_by(cand_nm) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  arrange(desc(total_amount)) %>%
  slice_head(n = 10)

top_cand
## # A tibble: 10 x 2
##    cand_nm                   total_amount
##    <chr>                            <dbl>
##  1 Clinton, Hillary Rodham      94784569.
##  2 Sanders, Bernard             20649084.
##  3 Trump, Donald J.             14064776.
##  4 Cruz, Rafael Edward 'Ted'     6075175.
##  5 Rubio, Marco                  5595281.
##  6 Bush, Jeb                     3357550.
##  7 Carson, Benjamin S.           3145278.
##  8 Kasich, John R.               1571849.
##  9 Fiorina, Carly                1501113.
## 10 Paul, Rand                     823542.
# For each of the top ten candidates, find the ten cities that contributed
# the most to that candidate's campaign.
top_cand_10 <- merged_dataframe %>%
  filter(cand_nm %in% top_cand$cand_nm) %>%
  group_by(primary_city, cand_nm) %>%
  # .groups = "drop" makes the regrouping explicit and silences the
  # "summarise() has grouped output" message.
  summarise(total_amount = sum(contb_receipt_amt), .groups = "drop") %>%
  group_by(cand_nm) %>%
  slice_max(total_amount, n = 10) %>%
  ungroup() %>%
  mutate(
    cand_nm = as.factor(cand_nm),
    # reorder_within() (tidytext) orders cities by amount *within* each
    # candidate; the plot must pair it with scale_x_reordered().
    primary_city = reorder_within(primary_city, total_amount, cand_nm)
  )

# Faceted bar chart: for each of the top ten candidates, the ten cities
# that gave the most, ordered within each facet (descending after the flip).
Plot_top_10 <- top_cand_10 %>%
  ggplot(aes(primary_city, total_amount)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ cand_nm, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(angle = 90)
  )

Plot_top_10

6 Details

  • Who did you collaborate with: Group 12: Chushi Guo, Erkka Salo, Joshua Nemy, Julien Vermeersch, Marta Maccagno, and Raymond Zexin Wu
  • Approximately how much time did you spend on this problem set: 14-16 hours cumulative
  • What, if anything, gave you the most trouble: Challenge #2 was quite tricky